Working Directory: /Users/jamesdiao/Documents/Kohane_Lab/2017-ACMG-penetrance/ACMG_Penetrance

setwd("/Users/jamesdiao/Documents/Kohane_Lab/2017-ACMG-penetrance")
#save.image("/Users/jamesdiao/Documents/Kohane_Lab/2017-ACMG-penetrance/Environ_2017-01-09.RData")
load("/Users/jamesdiao/Documents/Kohane_Lab/2017-ACMG-penetrance/Environ_2017-01-09.RData")
\newpage

1 Download, Transform, and Load Data

1.1 Collect ACMG Gene Panel

http://www.ncbi.nlm.nih.gov/clinvar/docs/acmg/

cat(sprintf("Table from ACMG SF v2.0 Paper %s x %s (selected rows):", nrow(ACMG.table), ncol(ACMG.table)))
## Table from ACMG SF v2.0 Paper 60 x 8 (selected rows):
row.names(ACMG.table) <- paste0("N",1:nrow(ACMG.table))
ACMG.table[1:5,] %>% pander
  Phenotype MIM_disorder PMID_Gene_Reviews_entry
N1 Hereditary breast and ovarian cancer 604370|612555 20301425
N2 Hereditary breast and ovarian cancer 604370|612555 20301425
N3 Li-Fraumeni syndrome 151623 20301488
N4 Peutz-Jeghers syndrome 175200 20301443
N5 Lynch syndrome 120435 20301390

Table continues below

  Typical_age_of_onset Gene MIM_gene Inheritance Variants_to_report
N1 Adult BRCA1 113705 AD KP&EP
N2 Adult BRCA2 600185 AD KP&EP
N3 Child/Adult TP53 191170 AD KP&EP
N4 Child/Adult STK11 602216 AD KP&EP
N5 Adult MLH1 120436 AD KP&EP
cat("ACMG-59 Genes:")
## ACMG-59 Genes:
print(ACMG.panel, quote = F)
##  [1] BRCA1   BRCA2   TP53    STK11   MLH1    MSH2    MSH6    PMS2   
##  [9] APC     MUTYH   BMPR1A  SMAD4   VHL     MEN1    RET     PTEN   
## [17] RB1     SDHD    SDHAF2  SDHC    SDHB    TSC1    TSC2    WT1    
## [25] NF2     COL3A1  FBN1    TGFBR1  TGFBR2  SMAD3   ACTA2   MYH11  
## [33] MYBPC3  MYH7    TNNT2   TNNI3   TPM1    MYL3    ACTC1   PRKAG2 
## [41] GLA     MYL2    LMNA    RYR2    PKP2    DSP     DSC2    TMEM43 
## [49] DSG2    KCNQ1   KCNH2   SCN5A   LDLR    APOB    PCSK9   ATP7B  
## [57] OTC     RYR1    CACNA1S


1.2 Download ClinVar VCF

ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz


ClinVar is the central repository for variant interpretations. Relevant information from the VCF includes:
(a) CLNSIG = “Variant Clinical Significance, 0 - Uncertain, 1 - Not provided, 2 - Benign, 3 - Likely benign,
4 - Likely pathogenic, 5 - Pathogenic, 6 - Drug response, 7 - Histocompatibility, 255 - Other”
(b) CLNDBN = “Variant disease name”
(c) CLNDSDBID = “Variant disease database ID”
(d) INTERP = Pathogenicity (likely pathogenic or pathogenic; CLNSIG = 4 or 5)

#Number to interpretation
clnsig_map <- c(0:7,255,-1) %>% setNames(c("Uncertain",
  "Not_Provided","Benign","Likely_Benign","Likely_Pathogenic",
  "Pathogenic","Drug_Response","Histocompatibility","Other","NA")) 
#"no_assertion - No assertion provided, 
#no_criteria - No assertion criteria provided, 
#single - Criteria #provided single submitter, 
#mult - Criteria provided multiple submitters no conflicts, 
#conf - Criteria #provided conflicting interpretations, 
#exp - Reviewed by expert panel, 
#guideline - Practice guideline"
get_clinvar <- function(clinvar_file) {
  file.by.line <- readLines(clinvar_file)
  #file_date <- as.Date(strsplit(file.by.line[2],"=")[[1]][2], "%Y%m%d")
  #system(sprintf("mv %s ClinVar_Reports/clinvar_%s.vcf", clinvar_file, file_date))
  clean.lines <- file.by.line[!grepl("##.*", file.by.line)] #Remove ## comments
  clean.lines[1] <- sub('.', '', clean.lines[1]) #Remove # from header
  input <- read.table(text = paste(clean.lines, collapse = "\n"), header = T, stringsAsFactors = F, 
                      comment.char = "", quote = "", sep = "\t")
  input <- input[nchar(input$REF)==1,] #deletions
  alt_num <- sapply(strsplit(input$ALT,","),length) #number of alts
  acceptable_nchar <- 2*alt_num-1 #adds in the length from commas, if each alt is 1 nt.
  input <- input[nchar(input$ALT)==acceptable_nchar,] #insertions
  input$ALT <- strsplit(input$ALT,",")
  split_all <- strsplit(input$INFO,";")
  has.clndsdbid <- any(grepl("CLNDSDBID", split_all))
  has.clnrevstat <- any(grepl("CLNREVSTAT", split_all))
  
  split_info <- function(name) {
    sapply(split_all, function(entry) {
      entry[grep(name,entry)]
    }) %>% strsplit("=") %>% sapply(function(x) x[2]) %>% strsplit(",")
  }
  input$CLNALLE <- split_info("CLNALLE=") %>% sapply(as.integer)
  input$CLNSIG <- split_info("CLNSIG=")
  input$CLNDBN <- split_info("CLNDBN=")
  if (has.clnrevstat)
    input$CLNREVSTAT <- split_info("CLNREVSTAT=")
  if (has.clndsdbid)
    input$CLNDSDBID <- split_info("CLNDSDBID=")
  #CLNALLE has 0,-1,3,4 --> CLNSIG has 1,2,3,4 --> ALT has 1. 
  taking <- sapply(input$CLNALLE, function(x) x[x>0] ) #Actual elements > 0. Keep these in CLNSIG and ALT 
  taking_loc <- sapply(input$CLNALLE, function(x) which(x>0) )#Tracks locations for keeping in CLNALLE
  keep <- sapply(taking, length)>0 #reduce everything to get rid of 0 and -1
  # Reduce, reduce, reduce. 
  taking <- taking[keep]
  taking_loc <- taking_loc[keep]
  input <- input[keep,]
  
  #Make this more readable
  input$ALT <- sapply(1:nrow(input), function(row) {
    input$ALT[[row]][taking[[row]]]
  })
  
  col_subset <- function(name) {
    sapply(1:nrow(input), function(row) {
      unlist(input[row,name])[taking_loc[[row]]]
    })
  }
  input$CLNSIG <- col_subset("CLNSIG")
  input$CLNALLE <- col_subset("CLNALLE")
  input$CLNDBN <- col_subset("CLNDBN")
  if (has.clnrevstat)
    input$CLNREVSTAT <- col_subset("CLNREVSTAT")
  if (has.clndsdbid)
    input$CLNDSDBID <- col_subset("CLNDSDBID")
  filter_condition <- input[,unlist(lapply(input, typeof))=="list"] %>% 
    apply(1,function(row) !any(is.na(row)))
  input <- input %>% filter(filter_condition) %>%
    unnest %>% unite(VAR_ID, CHROM, POS, REF, ALT, sep = "_", remove = F) %>%
    select(VAR_ID, CHROM, POS, ID, REF, ALT, CLNALLE, CLNSIG, everything()) %>% 
    mutate(CLNSIG = strsplit(CLNSIG,"|",fixed = T)) %>% 
    mutate(CLNDBN = strsplit(CLNDBN,"|",fixed = T)) %>% 
    mutate(POS = as.integer(POS))
  if (has.clnrevstat)
    input <- input %>% mutate(CLNREVSTAT = strsplit(CLNREVSTAT,"|",fixed = T))
  if (has.clndsdbid)
    input <- input %>% mutate(CLNDSDBID = strsplit(CLNDSDBID,"|",fixed = T)) 
  input$CLNSIG <- sapply(input$CLNSIG, function(x) as.integer(x))
  input$INTERP <- sapply(input$CLNSIG, function(x) any(x %in% c(4,5)) ) 
  input
}
clinvar <- get_clinvar(clinvar_file)
#clinvar[duplicated(clinvar$VAR_ID),1:8]
clinvar <- clinvar[!duplicated(clinvar$VAR_ID),]
cat(sprintf("Processed ClinVar data frame %s x %s (selected rows/columns):", nrow(clinvar), ncol(clinvar)))
## Processed ClinVar data frame 204730 x 15 (selected rows/columns):
clinvar[5:8,] %>% select(-CLNALLE, -INFO, -QUAL, -FILTER) %>% remove_rownames %>% pander
VAR_ID CHROM POS ID REF ALT CLNSIG
1_957568_A_G 1 957568 rs115704555 A G 2
1_957605_G_A 1 957605 rs756623659 G A 5
1_957640_C_T 1 957640 rs6657048 C T 255
1_957693_A_T 1 957693 rs879253787 A T 5

Table continues below

CLNDBN CLNREVSTAT CLNDSDBID INTERP
not_specified single CN169374 FALSE
Congenital_myasthenic_syndrome no_criteria C0751882:ORPHA590 TRUE
not_specified conf CN169374 FALSE
Congenital_myasthenic_syndrome no_criteria C0751882:ORPHA590 TRUE
unlink(clinvar_file)


1.3 Download 1000 Genomes VCFs

ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.[chrom].phase3_[version].20130502.genotypes.vcf.gz


Downloaded 1000 Genomes VCFs are saved in: /Users/jamesdiao/Documents/Kohane_Lab/2017-ACMG-penetrance/1000G/

download_1000g <- function(gene, download) {
  #for tracking: #gene %>% paste(which(ACMG.panel==gene)) %>% paste(length(ACMG.panel), sep = "/") %>% print
  success <- FALSE
  refGene <- sprintf("select * from refGene where name2 = \"%s\" limit 20", gene) %>% query
  UCSC <- select(refGene, name, chrom, start = txStart, end = txEnd)
  if (nrow(UCSC) == 0) { #No hit on refGene
    return(rep("NOT_FOUND",5) %>% setNames(c("name","chrom","start","end","downloaded")))
  } else {
    if (nrow(UCSC) > 1) #Multiple hits: take the widest range
      UCSC <- UCSC[which.max(UCSC$end-UCSC$start),]
    if (download) {
    # gets [n] from chr[n]
    chrom.num <- strsplit(UCSC$chrom, split = "chr")[[1]][2]
    # different version for chromosomes X and Y
    version <- switch(chrom.num, "X" = "shapeit2_mvncall_integrated_v1b",
                      "Y" = "integrated_v2a", "shapeit2_mvncall_integrated_v5a")
    command <- paste("tabix -h ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.%s.",
                     "phase3_%s.20130502.genotypes.vcf.gz %s:%s-%s > %s_genotypes.vcf", sep = "")
    sprintf(command, UCSC$chrom, version, chrom.num, UCSC$start, UCSC$end, gene) %>% system
    Sys.sleep(2)
    # Checks whether the file exists and has non-zero size
    exists <- grepl(paste(gene,"_genotypes.vcf",sep =""), system("ls", intern = T)) %>% sum > 0
    file.size <- strsplit(paste("stat ","_genotypes.vcf", sep = gene) %>% 
                            system(intern = T), " ")[[1]][8]
    success <- exists & file.size > 0
    }
  }
  return(c(UCSC,"downloaded" = success))
}

if (!skip_download & !skip_processing) {
  system("mkdir 1000G")
  setwd(paste(getwd(), "1000G", sep = "/"))
  for (con in dbListConnections(MySQL())) dbDisconnect(con)
  con <- dbConnect(MySQL(), user = 'genome',
                   dbname = 'hg19', host = 'genome-mysql.cse.ucsc.edu',
                   unix.sock = "/Applications/MAMP/tmp/mysql/mysql.sock")
  query <- function (input) { suppressWarnings(dbGetQuery(con, input)) }
  download_output <- sapply(ACMG.panel, function(gene) download_1000g(gene, download = T)) %>% t
  print(download_output, quote = F)
  download_output <- download_output %>% 
    apply(2, unlist) %>% 
    as.data.frame(stringsAsFactors = F) %>% 
    mutate("gene" = rownames(download_output)) %>% 
    select(gene, everything()) %>% 
    filter(downloaded != "NOT_FOUND")
  download_output <- download_output %>%
    mutate(chrom = sapply(strsplit(download_output$chrom,"chr"), function(x) x[2]), 
           start = as.integer(start), end = as.integer(end), 
           downloaded = as.logical(downloaded))
  write.table(download_output, file = "download_output.txt", 
              row.names = F, col.names = T, quote = F, sep = "\t")
  system("rm *.genotypes.vcf.gz.tbi")
  setwd("../")
} else {
  if (skip_download | skip_processing) {
    download_output <- read.table("Supplementary_Files/download_output.txt", header = T, stringsAsFactors = F)
  } else {
    download_output <- read.table("1000G/download_output.txt", header = T, stringsAsFactors = F)
  }
}
cat(sprintf("Download report: region and successes: %s x %s (selected rows):", nrow(download_output), ncol(download_output)))
## Download report: region and successes: 59 x 6 (selected rows):
download_output[1:5,] %>% format(scientific = F) %>% pander
gene name chrom start end downloaded
BRCA1 NM_007294 17 41196311 41277500 TRUE
BRCA2 NM_000059 13 32889616 32973809 TRUE
TP53 NM_000546 17 7571719 7590868 TRUE
STK11 NM_000455 19 1205797 1228434 TRUE
MLH1 NM_000249 3 37034840 37092337 TRUE
cat("File saved as download_output.txt in Supplementary_Files")
## File saved as download_output.txt in Supplementary_Files


\newpage

1.4 Import and Process 1000 Genomes VCFs

  1. Unnest the data frames to 1 row per variant_ID key (CHROM_POSITION_REF_ALT).
  2. Remove all insertions, deletions, CNV, etc, and keep only missense variants (1 REF, 1 ALT)
  3. For 1000 Genomes: convert genomes to allele counts. For example: (0|1) becomes 1, (1|1) becomes 2.
    Multiple alleles are unnested into multiple counts. For example: (0|2) becomes 0 for the first allele (no 1s) and 1 for the second allele (one 2).
import.file.1000g <- function(gene) {
  #for tracking: 
  sprintf("%s [%s/%s]", gene, grep(gene, ACMG.panel), length(ACMG.panel)) %>% print(quote = F)
  name <- paste("1000G",paste(gene,"genotypes.vcf", sep = "_"), sep = "/")
  output <- read.table(paste(getwd(),name,sep="/"), stringsAsFactors = FALSE)
  #Add header
  names(output)[1:length(header)] <- header
  #Remove all single alt indels
  output <- output[nchar(output$REF)==1,] #deletions
  alt_num <- sapply(strsplit(output$ALT,","),length) #number of alts
  acceptable_nchar <- 2*alt_num-1 #adds in the length from commas, if each alt is 1 nt.
  output <- output[nchar(output$ALT)==acceptable_nchar,] #insertions
  alt_num <- sapply(strsplit(output$ALT,","),length) #recalculate
  paired = which(alt_num!=1) #all with ,
  #Add AF Column
  af <- strsplit(output$INFO,";") %>% sapply("[", 2) %>% 
    strsplit("AF=") %>% sapply("[", 2) %>% strsplit(",") %>% sapply(as.numeric)
  output <- cbind(GENE = gene, "AF_1000G"=I(af), output) #Places it at the front of output
  front_cols <- 1:(grep("HG00096",colnames(output))-1)
  if (length(paired)!=0) {
    #Limit max vector length by sapply(strsplit(output$ALT,","),length)
    sapply(paired, function(rownum) { #For every row
      sapply(as.character(1:alt_num[rownum]), function(num) {
        grepl(paste(num,"|",sep = ""), output[rownum,-front_cols], fixed=T) +
        grepl(paste("|",num,sep = ""), output[rownum,-front_cols], fixed=T)
      }) %>% t -> temp
      split(temp, rep(1:ncol(temp), each = nrow(temp))) %>% setNames(NULL) 
      #Separate into list of vectors (1 entry for counting each ALT)
    }) %>% t -> insert
    insert <- cbind(output[paired,front_cols],insert)
    colnames(insert) <- colnames(output)
    insert <- insert %>% #adds front_col info
      mutate(ALT = strsplit(ALT,",")) %>% #Splits ALTS
      unnest() %>% #Unnests everything
      select(GENE, AF_1000G, CHROM, POS, ID, REF, ALT, everything()) #Reorders everything
    output <- output[-paired,] #Removes paired
  }
  output <- cbind(output[,front_cols],
                  apply(output[,-front_cols], 2, function(y) {
                    grepl("1|", y, fixed=T) +
                    grepl("|1", y, fixed=T)
                  }) ) #convert to logical
  if (length(paired)!=0)
    output <- rbind(output, insert) #joins the two
  output$AF_1000G <- as.numeric(output$AF_1000G)
  unite(output, VAR_ID, CHROM, POS, REF, ALT, sep = "_", remove = F) %>% arrange(VAR_ID)
  #Make VAR_ID, arrange by VAR_ID
}

if (skip_processing) {
  #saveRDS(ACMG.1000g, file = "ACMG_Penetrance/ACMG_1000G.rds")
  ACMG.1000g <- readRDS(file = "ACMG_Penetrance/ACMG_1000G.rds")
} else {
  # Import 1000G data for all ACMG
  ACMG.1000g <- NULL
  header <- c("CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", as.character(map$sample))
  for (gene in ACMG.panel) {
    #print(sprintf("[%d/%d] %s",which(gene==ACMG.panel),length(ACMG.panel),gene))
    ACMG.1000g <- rbind(ACMG.1000g,import.file.1000g(gene))
  }
  #Display and remove duplicates
  #ACMG.1000g[duplicated(ACMG.1000g$VAR_ID),1:8]
  ACMG.1000g <- ACMG.1000g[!duplicated(ACMG.1000g$VAR_ID),]
}
#write.csv(ACMG.1000g, file = "ACMG.1000G.csv", row.names = F, quote = F)
#save(ACMG.1000g, file = "ACMG.1000G")
cat(sprintf("Processed 1000 Genomes VCFs: %s x %s (selected rows/columns):", nrow(ACMG.1000g), ncol(ACMG.1000g)))
## Processed 1000 Genomes VCFs: 141467 x 2516 (selected rows/columns):
ACMG.1000g[1:5,1:18] %>% select(-INFO, -QUAL, -FILTER, -FORMAT) %>% 
  format(scientific = F) %>% pander
GENE AF_1000G VAR_ID CHROM POS ID REF ALT
BRCA1 0.004193290 17_41196363_C_T 17 41196363 rs8176320 C T
BRCA1 0.008386580 17_41196368_C_T 17 41196368 rs184237074 C T
BRCA1 0.000998403 17_41196372_T_C 17 41196372 rs189382442 T C
BRCA1 0.342252000 17_41196408_G_A 17 41196408 rs12516 G A
BRCA1 0.000399361 17_41196409_G_C 17 41196409 rs548275991 G C

Table continues below

HG00096 HG00097 HG00099 HG00100 HG00101 HG00102
0 0 0 0 0 0
0 0 0 0 0 0
0 0 0 0 0 0
1 0 1 1 0 2
0 0 0 0 0 0
if (rm_rs1805124)
  ACMG.1000g <- ACMG.1000g[ACMG.1000g$ID!="rs1805124",]


1.5 Import and Process gnomAD/ExAC VCFs

  1. Unnest the data frames to 1 row per variant_ID key (CHROM_POSITION_REF_ALT).
  2. Remove all insertions, deletions, CNV, etc, and keep only missense variants (1 REF, 1 ALT)
  3. Collect superpopulation-level allele frequencies:
    African = AFR, Latino = AMR, European (Finnish + Non-Finnish) = EUR, East.Asian = EAS, South.Asian = SAS.
import.file.exac <- function(gene, dataset) {
  file_name <- sprintf("%s/%s_%s.csv", dataset, dataset, gene)
  output <- read.csv(file_name, stringsAsFactors = FALSE)
  output$Number.of.Hemizygotes <- NULL #Inconsistently present column; removal allows row aggregation
  # Correcting for some alternate naming conventions
  if ("Conseq." %in% colnames(output))
    output <- output %>% rename(Consequence = Conseq.)
  if ("Count" %in% colnames(output))
    output <- output %>% rename(Allele.Count = Count)
  # Imputing missing South Asian values for NF2
  if (!("Allele.Count.South.Asian" %in% colnames(output))) {
    output$Allele.Number.South.Asian <- (2*output$Allele.Number) -
      (output %>% select(contains("Allele.Number")) %>% rowSums)
    output$Allele.Count.South.Asian <- (2*output$Allele.Count) - 
      (output %>% select(contains("Allele.Count")) %>% rowSums)
    output$Homozygote.Count.South.Asian <- (2*output$Number.of.Homozygotes) - 
      (output %>% select(contains("Homozygote")) %>% rowSums)
  }
  output <- cbind(GENE = gene, output[nchar(paste(output$Alternate,output$Reference))==3,]) %>% 
    select(GENE, AF_EXAC = contains("Freq"), CHROM=Chrom, POS=Position, 
           ID=RSID, REF=Reference, ALT=Alternate, Annotation = contains("Annot"), everything()) %>% 
    unite(VAR_ID, CHROM, POS, REF, ALT, sep = "_", remove = F) %>% arrange(VAR_ID)
  tags <- list("African","Latino","East.Asian","European","South.Asian")
  european <- output %>% select(contains("Finnish"), contains("European"))
  if (dataset == "gnomad") {
    european <- output %>% select(contains("Finnish"), contains("European"), contains("Jewish"))
    output <- output %>% select(GENE, AF_GNOMAD = AF_EXAC, everything())
  }
  output$Allele.Count.European <- european %>% select(contains("Allele.Count")) %>% rowSums
  output$Allele.Number.European <- european %>% select(contains("Allele.Number")) %>% rowSums
  exac_af <- output[,sprintf("Allele.Count.%s", tags)] / output[,sprintf("Allele.Number.%s", tags)]
  colnames(exac_af) <- sprintf("AF_%s_%s", toupper(dataset), c("AFR","AMR","EAS","EUR","SAS"))
  output <- cbind(output, exac_af) %>% 
    select(GENE, contains(toupper(dataset)), everything())
  if (rm_rs1805124)
    output <- output[output$ID!="rs1805124",]
  return(output)
}

# Import ExAC data for all ACMG
ACMG.exac <- NULL
ACMG.gnomad <- NULL
for (gene in ACMG.panel) {
  #print(sprintf("[%d/%d] %s",which(gene==ACMG.panel),length(ACMG.panel),gene))
  ACMG.exac <- rbind(ACMG.exac,import.file.exac(gene, "exac"))
  ACMG.gnomad <- rbind(ACMG.gnomad,import.file.exac(gene, "gnomad"))
}
#Display and remove duplicates
#ACMG.exac[duplicated(ACMG.exac$VAR_ID),1:8]
#ACMG.gnomad[duplicated(ACMG.gnomad$VAR_ID),1:8]
ACMG.exac <- ACMG.exac[!duplicated(ACMG.exac$VAR_ID),]
ACMG.gnomad <- ACMG.gnomad[!duplicated(ACMG.gnomad$VAR_ID),]
#write.csv(ACMG.exac, file = "ACMG.ExAC.csv", row.names = F, quote = F)
#write.csv(ACMG.gnomad, file = "ACMG.gnomAD.csv", row.names = F, quote = F)

cat(sprintf("Processed gnomAD VCFs: %s x %s (selected rows/columns):", nrow(ACMG.gnomad), ncol(ACMG.gnomad)))
## Processed gnomAD VCFs: 96742 x 48 (selected rows/columns):
ACMG.gnomad[sample(nrow(ACMG.gnomad) %>% sort,5),c(1,2,8)] %>% format(scientific = F) %>% pander
  GENE AF_GNOMAD VAR_ID
63619 PKP2 0.00000797 12_32976966_T_G
35370 COL3A1 0.00000397 2_189868121_C_T
46660 MYH11 0.02760000 16_15879812_C_T
43762 MYH11 0.00020300 16_15808945_C_T
1501 BRCA1 0.00003310 17_41256214_A_C
cat(sprintf("Processed ExAC VCFs: %s x %s (selected rows/columns):", nrow(ACMG.exac), ncol(ACMG.exac)))
## Processed ExAC VCFs: 59883 x 45 (selected rows/columns):
ACMG.exac[sample(nrow(ACMG.exac),5) %>% sort,c(1,2,8)] %>% format(scientific = F) %>% pander
  GENE AF_EXAC VAR_ID
22854 COL3A1 0.000046190 2_189860521_A_G
23888 FBN1 0.000008245 15_48717982_A_G
32401 TNNT2 0.000017070 1_201334784_T_C
37501 RYR2 0.000008288 1_237789028_A_T
45383 KCNH2 0.000009720 7_150648963_A_G


\newpage

1.6 Collect 1000 Genomes Phase 3 Populations Map

This will allow us to assign genotypes from the 1000 Genomes VCF to ancestral groups.
From: ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/integrated_call_samples_v3.20130502.ALL.panel

#read the map and delete the file
map <- read.table(file = "Supplementary_Files/phase3map.txt", stringsAsFactors = F, header = T) %>% as.data.frame
#display
cat("Phase 3 Populations Map Table: 2504 x 4 (selected rows)")
## Phase 3 Populations Map Table: 2504 x 4 (selected rows)
map[sample(nrow(map),6),] %>% arrange(super_pop) %>% remove_rownames %>% pander
sample pop super_pop gender
NA19213 YRI AFR male
NA19755 MXL AMR female
HG01375 CLM AMR female
HG01259 CLM AMR male
NA19081 JPT EAS female
HG03234 PJL SAS male
#Make list of populations and superpopulations for later plotting
pop.table <- map[!duplicated(map$pop),] %>% 
  select(contains("pop")) %>% arrange(super_pop, pop)
super <- pop.table$super_pop %>% setNames(pop.table$pop)
super.levels <- unique(pop.table$super_pop)
pop.levels <- unique(pop.table$pop)
#Plot distribution of ancestral backgrounds
#Population = factor(as.character(map$pop), levels = pop.levels)
#cat("Population Distribution")
#ggplot(map, aes(map$super_pop, fill = Population)) + 
#  geom_bar(color = 'black', width = 0.5) + 
#  ylab ("No. of Individuals") + xlab ("Superpopulation") + 
#  ggtitle("1000 Genomes - Samples by Population")
#rm(Population)
if (!("AF_1000G_AFR" %in% colnames(ACMG.1000g))) {
  front_cols <- 1:(grep("HG00096",colnames(ACMG.1000g))-1)
  sapply(super.levels, function(superpop){
    keep <- map$super_pop == superpop
    (ACMG.1000g[,length(front_cols)+which(keep)] %>% rowSums)/(2*sum(keep))
  }) -> pop_af
  colnames(pop_af) <- sprintf("AF_1000G_%s",super.levels)
  ACMG.1000g <- data.frame(ACMG.1000g, pop_af) %>% 
    select(GENE, AF_1000G, VAR_ID, CHROM, POS, ID, REF, ALT, 
           AF_1000G_AFR, AF_1000G_AMR, AF_1000G_EAS, AF_1000G_EUR, AF_1000G_SAS, everything())
  rm(pop_af)
}


1.7 Merge ClinVar with gnomAD, ExAC, and 1000 Genomes

merge_clinvar_1000g <- function() {
  inter <- intersect(clinvar$VAR_ID[clinvar$INTERP], ACMG.1000g$VAR_ID)
  clinvar_merged <- clinvar[(clinvar$VAR_ID %in% inter),] %>% arrange(VAR_ID)
  ACMG_merged <- ACMG.1000g[ACMG.1000g$VAR_ID %in% inter,] %>% arrange(VAR_ID)
  front_cols <- 1:(grep("HG00096",colnames(ACMG.1000g))-1)
  cbind(ACMG_merged[,c("GENE",sprintf("AF_1000G%s", c("",paste0("_",super.levels))))], 
        clinvar_merged,ACMG_merged[,-front_cols])
}
merged_1000g <- merge_clinvar_1000g()

inter <- intersect(clinvar$VAR_ID[clinvar$INTERP], ACMG.exac$VAR_ID)
merged_exac <- cbind(clinvar[(clinvar$VAR_ID %in% inter),] %>% arrange(VAR_ID), 
  ACMG.exac %>% select(VAR_ID, contains("AF_"), GENE) %>% 
    filter(VAR_ID %in% inter) %>% arrange(VAR_ID) %>% select(-VAR_ID)
  ) %>% select(VAR_ID, GENE, AF_EXAC, contains("AF_"), everything())

inter <- intersect(clinvar$VAR_ID[clinvar$INTERP], ACMG.gnomad$VAR_ID)
merged_gnomad <- cbind(clinvar[(clinvar$VAR_ID %in% inter),] %>% arrange(VAR_ID), 
  ACMG.gnomad %>% select(VAR_ID, contains("AF_"), GENE) %>% 
    filter(VAR_ID %in% inter) %>% arrange(VAR_ID) %>% select(-VAR_ID)
  ) %>% select(VAR_ID, GENE, AF_GNOMAD, contains("AF_"), everything())

#count up all pathogenic ClinVar in ACMG regions
is.acmg <- function(row) {
  genes <- which(row$CHROM == download_output$chrom)
  sapply(genes, function(gene) {
    between(row$POS, download_output[gene,]$start, download_output[gene,]$end)
  }) %>% any
}
cat("Breakdown of ClinVar Variants")
## Breakdown of ClinVar Variants
data.frame(Subset_ClinVar = c("Total ClinVar","LP/P","ACMG LP/P",
  "ACMG LP/P in gnomAD", "ACMG LP/P in ExAC","ACMG LP/P in 1000 Genomes"),
   Number_of_Variants = c(nrow(clinvar), 
                          sum(clinvar$INTERP), 
                          sum(apply(clinvar[clinvar$INTERP,], 1, is.acmg)), 
                          nrow(merged_gnomad),
                          nrow(merged_exac), 
                          nrow(merged_1000g))) %>% pander
Subset_ClinVar Number_of_Variants
Total ClinVar 204730
LP/P 34152
ACMG LP/P 6781
ACMG LP/P in gnomAD 1179
ACMG LP/P in ExAC 845
ACMG LP/P in 1000 Genomes 135
breakdown <- function(dataset) {
  dataset_name <- ifelse(dataset == "1000G", dataset, tolower(dataset))
  ACMG.data <- parse(text=sprintf("ACMG.%s", tolower(dataset))) %>% eval
  sprintf("Breakdown of ACMG-%s Variants", dataset) %>% cat
  data.frame(Subset_gnomAD = sprintf(c("ACMG in %s", "ClinVar-ACMG in %s", "LP/P-ACMG in %s"), dataset), 
  Number_of_Variants = c(nrow(ACMG.data),
    intersect(clinvar$VAR_ID, ACMG.data$VAR_ID) %>% length, 
    parse(text=sprintf("merged_%s", tolower(dataset))) %>% eval %>% nrow
    )
  ) %>% pander
}
breakdown("gnomAD")
## Breakdown of ACMG-gnomAD Variants
Subset_gnomAD Number_of_Variants
ACMG in gnomAD 96742
ClinVar-ACMG in gnomAD 13897
LP/P-ACMG in gnomAD 1179
breakdown("ExAC")
## Breakdown of ACMG-ExAC Variants
Subset_gnomAD Number_of_Variants
ACMG in ExAC 59883
ClinVar-ACMG in ExAC 10778
LP/P-ACMG in ExAC 845
breakdown("1000G")
## Breakdown of ACMG-1000G Variants
Subset_gnomAD Number_of_Variants
ACMG in 1000G 141466
ClinVar-ACMG in 1000G 6012
LP/P-ACMG in 1000G 135


\newpage

1.8 Comparison with ClinVar Browser Query Results

clinvar_query.txt contains all results matched by the search query: “(APC[GENE] OR MYH11[GENE]… OR WT1[GENE]) AND (clinsig_pathogenic[prop] OR clinsig_likely_pathogenic[prop])” from the ClinVar website. The exact query is saved in /Supplementary_Files/query_input.txt


This presents another way of collecting data from ClinVar.

Intermediate step: convert hg38 locations to hg19 using the Batch Coordinate Conversion tool (liftOver) from UCSC Genome Browser Utilities.

clinvar_query <- read.table(file = "Supplementary_Files/clinvar_result_2016_12_04.txt", sep = "\t", header = F, skip = 1, stringsAsFactors = F)
colnames(clinvar_query) <- c("Name", "Gene(s)", "Condition(s)", "Frequency", "Clinical significance (Last reviewed)","Review status", "Chromosome","Location","Assembly","VariationID","AlleleID(s)")
clinvar_count <- NULL
clinvar_count <- clinvar_count %>% c(nrow(clinvar_query)) #1
clinvar_query <- clinvar_query[grepl(".>.",clinvar_query$Name),]
clinvar_count <- clinvar_count %>% c(nrow(clinvar_query)) #2
clinvar_query <- clinvar_query[!grepl(" - ", clinvar_query$Location) & 
                    !grepl("|",clinvar_query$Chromosome, fixed = T) &
                    !(clinvar_query$Location == "") & 
                    !grepl("Conflicting", clinvar_query$`Clinical significance (Last reviewed)`),]
clinvar_count <- clinvar_count %>% c(nrow(clinvar_query)) #3
clinvar_query <- clinvar_query %>%
  mutate(Name = sub(".*(.>.).*","\\1", clinvar_query$Name)) %>% 
  mutate(Location = as.integer(Location)) %>%
  separate(Name, into = c("Alternate","Reference"), sep = ">")
#liftOver from http://hgdownload.soe.ucsc.edu/admin/exe/macOSX.x86_64/liftOverMerge
#input data from http://hgdownload.soe.ucsc.edu/goldenPath/hg38/liftOver/
#paste(paste0("chr",clinvar_query$Chromosome), clinvar_query$Location, clinvar_query$Location+1) %>%
#  write.table(file = "Supplementary_Files/cvquery_hg38_2016_12_04.bed", quote = F, row.names = F, col.names = F)
#system("chmod +x ../Tools/liftOver")
#system("../Tools/liftOver Supplementary_Files/cvquery_hg38_2016_12_04.bed ../Tools/hg38ToHg19.over.chain.gz Supplementary_Files/cvquery_hg19_2016_12_04.bed err.log")
cvquery_hg19 <- read.table(file = "Supplementary_Files/cvquery_hg19_2016_12_04.bed", sep = "\t", header = F, stringsAsFactors = F)
clinvar_query <- clinvar_query %>%
  mutate(Location = cvquery_hg19$V2) %>%
  unite(col = "VAR_ID", Chromosome, Location, Reference, Alternate, sep = "_", remove = F)
clinvar_count <- clinvar_count %>% c(sum(clinvar_query$VAR_ID %in% clinvar$VAR_ID), #4
                    sum(clinvar_query$VAR_ID %in% clinvar$VAR_ID[clinvar$INTERP]), #5
                    sum(clinvar_query$VAR_ID %in% merged_gnomad$VAR_ID), #6
                    sum(clinvar_query$VAR_ID %in% merged_exac$VAR_ID), #7
                    sum(clinvar_query$VAR_ID %in% merged_1000g$VAR_ID)) #8
clinvar_query[(clinvar_query$VAR_ID %in% merged_exac$VAR_ID) & (clinvar_query$VAR_ID %in% merged_1000g$VAR_ID),] -> temp
cat(sprintf("ClinVar Query Results Table (substitutions only): %s x %s (selected rows/columns)", 
            nrow(clinvar_query), ncol(clinvar_query)))
## ClinVar Query Results Table (substitutions only): 6445 x 13 (selected rows/columns)
clinvar_query %>% filter(clinvar_query$VAR_ID %in% merged_gnomad$VAR_ID) %>% 
  slice(1:5) %>% select(c(1,4:8)) %>%
  mutate(`Condition(s)` = `Condition(s)` %>% strsplit("|", fixed = T) %>% sapply("[",1)) %>%
  mutate(Frequency = Frequency %>% strsplit(", ") %>% sapply("[",1) ) %>% 
  mutate(`Gene(s)` = `Gene(s)` %>% strsplit("|", fixed = T) %>% sapply("[",1) ) %>% pander
VAR_ID Gene(s) Condition(s) Frequency
1_17350520_G_C SDHB Paragangliomas 4 NA
1_45798631_T_A MUTYH Hereditary cancer-predisposing syndrome NA
1_201334766_A_T TNNT2 Familial hypertrophic cardiomyopathy 2 NA
3_38646328_G_C SCN5A Atrial fibrillation, familial, 10 NA
3_38647447_G_C SCN5A Atrial fibrillation, familial, 10 NA

Table continues below

Clinical significance (Last reviewed) Review status
Pathogenic/Likely pathogenic, not provided(Last reviewed: Feb 2, 2015) criteria provided, single submitter
Pathogenic(Last reviewed: Sep 17, 2015) criteria provided, single submitter
Pathogenic/Likely pathogenic(Last reviewed: Feb 27, 2016) criteria provided, multiple submitters, no conflicts
Pathogenic(Last reviewed: Apr 15, 2008) no assertion criteria provided
Pathogenic(Last reviewed: Apr 15, 2008) no assertion criteria provided
cat("Breakdown of ClinVar Query Results Table: ")
## Breakdown of ClinVar Query Results Table:
data.frame(Subset = c("Initial Count","Filter Substitutions (N>N')","Filter Coupling/Bad-Locations","In ClinVar VCF","In LP/P-ClinVar","In LP/P-ACMG & gnomAD","In LP/P-ACMG & ExAC","In LP/P-ACMG & 1000G"), Number_of_Variants = clinvar_count) %>% pander
Subset Number_of_Variants
Initial Count 14097
Filter Substitutions (N>N’) 7039
Filter Coupling/Bad-Locations 6445
In ClinVar VCF 494
In LP/P-ClinVar 493
In LP/P-ACMG & gnomAD 45
In LP/P-ACMG & ExAC 33
In LP/P-ACMG & 1000G 1
cat("Note the large reduction after merging the online query results with the VCF.")
## Note the large reduction after merging the online query results with the VCF.



\newpage

2 Plot Summary Statistics Across Populations

### For plotting population level data:
prettyprint <- function(values, sd, title, xlabel, ylabel, ylimit) {
  if (missing(sd)) sd <- TRUE
  if (missing(title)) title <- NULL
  if (missing(xlabel)) xlabel <- "Population"
  if (missing(ylabel)) ylabel <- NULL
  if (missing(ylimit)) ylimit <- NULL
  colnames(values) <- c("Mean","SD")
  values$Population <- factor(pop.levels, levels = pop.levels)
  values$Superpopulation <- factor(super[pop.levels], levels = super.levels)
  
  plot.pop <- ggplot(values, aes(x=Population, y=Mean, fill = Superpopulation)) +
    geom_bar(stat = "identity") + ggtitle(title) + xlab(xlabel) + ylab(ylabel) +
    theme_minimal() + theme(axis.text.x = element_text(angle = -45, hjust = 0.4))
  if (sd) {
    if (min(values$Mean - values$SD)<0)
      plot.pop <- plot.pop + geom_errorbar(aes(
        ymin = pmax(0,values$Mean - values$SD), 
        ymax = values$Mean + values$SD, width = 0.5))
    else 
      plot.pop <- plot.pop + geom_errorbar(aes(ymin = values$Mean - values$SD, 
                                               ymax = values$Mean + values$SD, width = 0.5))
  } else {values$SD = 0}
  if (length(ylimit)==2)
    plot.pop <- plot.pop + ylim(ylimit[1],ylimit[2])
  else
    plot.pop <- plot.pop + ylim(0, 1.1*max(values$Mean + values$SD))
  plot.pop
}


2.1 Distribution of Allele Frequencies


plot_af_distrib <- function() {
  af_distrib <- data.frame(Index = 2:max(nrow(merged_1000g),nrow(merged_exac))-1,
    AF_1000G = sort(merged_1000g$AF_1000G[merged_1000g$AF_1000G != max(merged_1000g$AF_1000G)],
                    decreasing = T) %>% c(rep(NA,nrow(merged_exac)-nrow(merged_1000g))), 
    AF_EXAC = sort(merged_exac$AF_EXAC[merged_exac$AF_EXAC != max(merged_exac$AF_EXAC)], 
                   decreasing = T)) %>%
    gather(Dataset, Allele_Frequency, AF_1000G, AF_EXAC) %>%
    filter(!is.na(Allele_Frequency))
  ggplot(aes(x = Allele_Frequency, color = Dataset), data = af_distrib) + 
    geom_density() + facet_grid(Dataset ~ .) + xlab("Allele Frequency") + ylab("Density") +
    theme(legend.position="none")
}
#plot_af_distrib()

data.frame(
  Spectrum_1000_Genomes = table(round(ACMG.1000g$AF_1000G*5008))[as.character(1:20)] %>% as.vector,
  Spectrum_ExAC = table(ACMG.exac$Allele.Count)[as.character(1:20)] %>% as.vector,
  Spectrum_gnomAD = table(ACMG.gnomad$Allele.Count)[as.character(1:20)] %>% as.vector
) %>% gather(Dataset, Frequency) %>%
ggplot(aes(x = c(1:20,1:20,1:20), y = Frequency)) + 
  geom_bar(aes(fill = Dataset), position = "dodge", stat = "identity") + 
  xlab("Allele Counts") + ggtitle("Allele Count Spectrum (1-20)") + 
  theme(legend.position="bottom")

#hist(round(ACMG.1000g$AF_1000G*5008) %>% table -> temp, breaks = 300000, xlim = c(0,20), 
#     main = "Allele Frequency Spectrum", xlab = "Allele Count")
#hist(table(ACMG.exac$Allele.Count)+0.2, breaks = 120000, xlim = c(0,20), ylim = c(0,1100), 
#     add = T, col = 'red')

#Test of Poissonness
x = table(merged_1000g$AF_1000G)
k = 1:length(x)-1
poissondata = data.frame(k=k, poissonness = as.vector(log(x)+lfactorial(k)))

The distribution of allele frequencies is approximately Poisson, with “Poissonness plot” correlation = 0.99. The Poissonness plot (Hoaglin 1980) is defined as the plot of \(log(x_k) + log(k!)\) vs. \(k\), as shown below:

ggplot(aes(x=k,y=poissonness), data = poissondata) + geom_point() + ylab("log(x_k) + log(k!)") + ggtitle("Poissonness Plot")

\newpage
var_plot_1000g <- function(pathogenic, frac) {
  if (pathogenic){
    KP <- sapply(merged_1000g$CLNSIG, function(x) 5 %in% x) 
    KP_only <- c("RET","PRKAG2","MYH7","TNNI3","TPM1","MYL3","CACNA1S",
      "DSP","MYL2","APOB","PCSK9","RYR1","RYR2","SDHAF2","ACTC1")
    ACMG.data <- merged_1000g[!((merged_1000g$GENE %in% KP_only) & (!KP)),]
  } else {
    ACMG.data <- ACMG.1000g
  }
  front_cols <- 1:(grep("HG00096",colnames(ACMG.data))-1)
  recessive <- ACMG.data$GENE %in% c("MUTYH","ATP7B")
  sapply(pop.levels, function(pop) {
    keep <- c(front_cols,map$pop)==pop
    if (frac) {
      temp <- (colSums(ACMG.data[!recessive,keep])>0) + 
              (colSums(ACMG.data[ recessive,keep])>1)
    } else {
      #Counts the number of non-reference sites in a gene
      temp <- colSums(ACMG.data[,keep]>0)
    }
    c(mean(temp), sd(temp))
  }) %>% t %>% tbl_df -> values #Number of non-reference sites across the different populations
  colnames(values) <- c("Mean","SD")
  values$Population <- factor(pop.levels, levels = pop.levels)
  values$Superpopulation <- factor(super[pop.levels], levels = super.levels)
  title <- sprintf("ACMG-59%s: %s in 1000 Genomes", 
              ifelse(pathogenic, " Pathogenic",""), ifelse(frac,"Fraction","Mean"))
  prettyprint(values, title = title, sd = F, ylimit = NULL, 
              xlabel = "Population", 
              ylabel = ifelse(frac, "Fraction with at least 1 non-reference site", 
                              "Mean No. of Non-Reference Sites")
  )
}

var_plot_exac <- function(dataset, pathogenic, frac) {
  ACMG.data <- parse(text=paste0(ifelse(pathogenic, "merged_", "ACMG."), tolower(dataset))) %>% eval
  exac_prob <- 1-(1-ACMG.data[,sprintf("AF_%s_%s", toupper(dataset), super.levels)])^2
  if (frac) {
    exac_values <- data.frame(1-apply(1-exac_prob, 2, prod, na.rm = T), super.levels)
  } else {
    exac_values <- data.frame(exac_prob %>% colSums(na.rm = T), super.levels)
  }
  colnames(exac_values) = c("values","Superpopulation")
  ggplot(exac_values, aes(x = Superpopulation, y=values, fill = Superpopulation)) + 
    geom_bar(stat = "identity") + theme_minimal() + 
    ggtitle(sprintf("ACMG-59%s: %s in %s", 
              ifelse(pathogenic, " Pathogenic",""), ifelse(frac,"Fraction","Mean"), dataset)) + 
    xlab("Population") + ylab("Mean No. of Non-Reference Sites") + 
    ylim(0,1.1*max(exac_values$values))
}

2.2 Overall Non-Reference Sites

2.2.0.1 For 1000 Genomes

Each individual has \(n\) non-reference sites, which can be found by counting. The mean number is computed for each population.

Ex: the genotype of 3 variants in 3 people looks like this:

example <- ACMG.1000g[3142:3144,sprintf("HG00%d",366:368)]
rownames(example) <- c("Variant 1", "Variant 2","Variant 3")
example %>% pander
  HG00366 HG00367 HG00368
Variant 1 2 1 1
Variant 2 2 1 1
Variant 3 1 0 0

Count the number of non-reference sites per individual:

colSums(example>0) %>% pander
HG00366 HG00367 HG00368
3 2 2
cat(sprintf("Mean = %s", mean(colSums(example>0)) %>% signif(3)))
## Mean = 2.33
var_plot_1000g(pathogenic = F, frac = F)


Note: the error bars denote standard deviation, not standard error.

\newpage

2.2.0.2 For gnomAD/ExAC

The mean number of non-reference sites is \(E(V)\), where \(V = \sum_{i=1}^n v_i\) is the number of non-reference sites at all variant positions \(v_1\) through \(v_n\).

At each variant site, the probability of having at least 1 non-reference allele is \(P(v_i) = P(v_{i,a} \cup v_{i,b})\), where \(a\) and \(b\) indicate the 1st and 2nd allele at each site.

If the two alleles are independent, \(P(v_{i,a} \cup v_{i,b})\) = \(1-(1-P(v_{i,a}))(1-P(v_{i,b})) = 1-(1-AF(v_i))^2\)

If all variants are independent, \(E(V) = \sum_{i=1}^n 1-(1-AF(v_i))^2\) for any set of allele frequencies.


Ex: the allele frequencies of 3 variants across the 5 superpopulations looks like this:

example <- rbind(c(0.1,0.2,0,0,0.3),c(0.2,0,0.3,0,0.1)) %>% as.data.frame
rownames(example) <- c("Variant 1", "Variant 2")
colnames(example) <- super.levels
example %>% pander
  AFR AMR EAS EUR SAS
Variant 1 0.1 0.2 0 0 0.3
Variant 2 0.2 0 0.3 0 0.1

The probability of having at least 1 non-reference site at each variant - (0|1) (1|0) or (1|1) is given by \(1-(1-AF)^2\). Note that this is approximately \(2*AF\) when \(AF\) is small:

as.data.frame(1-(1-example)^2) %>% pander
  AFR AMR EAS EUR SAS
Variant 1 0.19 0.36 0 0 0.51
Variant 2 0.36 0 0.51 0 0.19

By linearity of expectation, the expected (mean) number of non-reference sites is \(\sum E(V_i) = \sum (columns)\).

colSums(1-(1-example)^2) %>% pander
AFR AMR EAS EUR SAS
0.55 0.36 0.51 0 0.7
### ExAC
#Each element is the probability that at least 1 of the 2 alleles are non-reference.
var_plot_exac("gnomAD", pathogenic = F, frac = F)

#var_plot_exac("ExAC", pathogenic = F, frac = F)


\newpage

2.3 Pathogenic Non-Reference Sites

2.3.0.1 For 1000 Genomes and gnomAD/ExAC

This is the same procedure as above, but performed only on the subset of variants that are pathogenic.

var_plot_1000g(pathogenic = T, frac = F)

var_plot_exac("gnomAD", pathogenic = T, frac = F)

#var_plot_exac("ExAC", pathogenic = T, frac = F)


\newpage

2.4 Fraction of Individuals with Pathogenic Sites

2.4.0.1 For 1000 Genomes

We can count up the fraction of individuals with 1+ non-reference site(s) in each population. This is the fraction of individuals who would receive a positive genetic test result in at least 1 of the ACMG-59 genes.

Ex: the genotype of 3 variants in 3 people looks like this:

example <- ACMG.1000g[3142:3144,sprintf("HG00%d",366:368)]
rownames(example) <- c("Variant 1", "Variant 2","Variant 3")
example %>% pander
  HG00366 HG00367 HG00368
Variant 1 2 1 1
Variant 2 2 1 1
Variant 3 1 0 0

Count each individual as having a non-reference site (1) or having only reference sites (0):

(1*(colSums(example>0)>0)) %>% pander
HG00366 HG00367 HG00368
1 1 1
cat(sprintf("Mean = %s", mean(1*(colSums(example>0)>0)) %>% signif(3)))
## Mean = 1
var_plot_1000g(pathogenic = T, frac = T)

\newpage

2.4.0.2 For gnomAD/ExAC

The probability of having at least 1 non-reference site is \(P(X)\), where \(X\) indicates a non-reference site at any variant position \(v_1\) through \(v_n\).

Recall that \(P(v_i) = P(v_{i,a} \cup v_{i,b}) = 1-(1-AF(v))^2\) when alleles are independent.

If all alleles are independent, \(P(X) = P(\bigcup_{i=1}^n v_i) = 1-\prod_{i=1}^n (1-AF(v_i))^2\)

Ex: the allele frequencies of 3 variants across the 5 superpopulations looks like this:

example <- rbind(c(0.1,0.2,0,0,0.3),c(0.2,0,0.3,0,0.1)) %>% as.data.frame
rownames(example) <- c("Variant 1", "Variant 2")
colnames(example) <- super.levels
example %>% pander
  AFR AMR EAS EUR SAS
Variant 1 0.1 0.2 0 0 0.3
Variant 2 0.2 0 0.3 0 0.1

The probability of having at least 1 non-reference site at each variant - (0|1) (1|0) or (1|1) is given by \(1-(1-AF)^2\).
Note that this is approximately \(2*AF\) when \(AF\) is small:

as.data.frame(1-(1-example)^2) %>% pander
  AFR AMR EAS EUR SAS
Variant 1 0.19 0.36 0 0 0.51
Variant 2 0.36 0 0.51 0 0.19

The expected (mean) number of non-reference sites is given by \(1-\prod (1-AF)^2\).

apply(example, 2, function(x) 1-prod((1-x)^2)) %>% pander
AFR AMR EAS EUR SAS
0.4816 0.36 0.51 0 0.6031
### ExAC
#Each element is the probability that at least 1 of the 2 alleles are non-reference.
var_plot_exac("gnomAD", pathogenic = T, frac = T)

#var_plot_exac("ExAC", pathogenic = T, frac = T)


\newpage

2.5 Common Pathogenic Variants by Ancestry

### 1000 Genomes
merged_1000g %>% select(contains("AF_1000G_")) -> af_1000g_by_ancestry
rownames(af_1000g_by_ancestry) <- merged_1000g$VAR_ID
colnames(af_1000g_by_ancestry) <- super.levels
ord <- order(apply(af_1000g_by_ancestry,1,sum), decreasing = T)[1:8]
ranked_id <- row.names(af_1000g_by_ancestry)[ord]
ranked_var <- data.frame(Var_ID = factor(ranked_id, levels = ranked_id), 
    af_1000g_by_ancestry[ord,]) %>% gather(Ancestry, Subdivided_Allele_Frequencies, -Var_ID)
ggplot(ranked_var, aes(x = Var_ID, y = Subdivided_Allele_Frequencies, fill = Ancestry)) +
    geom_bar(stat='identity', color = 'black', width = 0.7) + 
    ggtitle("High Frequency Variants in 1000 Genomes") + coord_flip()

### gnomAD
af_gnomad_by_ancestry <- merged_gnomad[,sprintf("AF_GNOMAD_%s",super.levels)]
colnames(af_gnomad_by_ancestry) <- super.levels
ord <- order(apply(af_gnomad_by_ancestry,1,sum), decreasing = T)[1:8]
ranked_id <- merged_gnomad$VAR_ID[ord]
ranked_var <- data.frame(Var_ID = factor(ranked_id, levels = ranked_id), 
                         af_gnomad_by_ancestry[ord,]) %>% 
              gather(Ancestry, Subdivided_Allele_Frequencies, 2:6)
ggplot(ranked_var, aes(x = Var_ID, y = Subdivided_Allele_Frequencies, fill = Ancestry)) +
    geom_bar(stat='identity', color = 'black', width = 0.7) + 
    ggtitle("High Frequency Variants in gnomAD") + coord_flip()


3 Penetrance Estimates

3.1 Bayes’ Rule as a Model for Estimating Penetrance

Let \(V_x\) be the event that an individual has 1 or more variant related to disease \(x\),
and \(D_x\) be the event that the individual is later diagnosed with disease \(x\).

In this case, we can define the following probabilities:
1. Prevalence = \(P(D_x)\)
2. Allele Frequency = \(P(V_x)\)
3. Allelic Heterogeneity = \(P(V_x|D_x)\)
4. Penetrance = \(P(D_x|V_x)\)

By Bayes’ Rule, the penetrance of a variant related to disease \(x\) may be defined as: \[P(D_x|V_x) = \frac{P(D_x)*P(V_x|D_x)}{P(V_x)} = \frac{Prevalence*Allelic.Heterogeneity}{Allele.Frequency}\]

To compute penetrance estimates for each of the diseases related to the ACMG-59 genes, we will use the prevalence data we collected into Literature_Prevalence_Estimates.csv, allele frequency data from 1000 Genomes and ExAC, and a broad range of values for allelic heterogeneity.

3.2 Import Literature-Based Disease Prevalence Data

Data Collection: 1. Similar disease subtypes were grouped together (e.g., the 8 different types of familial hypertrophic cardiomyopathy), resulting in 30 disease categories across 59 genes.
2. The search query “[disease name] prevalence” was used to find articles using Google Scholar.
3. Prevalence estimates were recorded along with URL, journal, region, publication year, sample size, first author, population subset (if applicable), date accessed, and potential issues. Preference was given to studies with PubMed IDs, more citations, and larger sample sizes.

Prevalence was recorded as reported: either a point estimate or a range. Values of varying quality were collected across all diseases.

ACMG_Lit_Full <- read.csv(file = "Supplementary_Files/Literature_Prevalence_Estimates.csv", 
              header = TRUE, stringsAsFactors = F, na.strings = "\\N") 
ACMG_Lit <- ACMG_Lit_Full %>% filter(Evaluate)
abbrev <- ACMG_Lit$Short_Name
abbrev_all <- ACMG_Lit_Full$Short_Name
inv.prev <- ACMG_Lit$Inverse_Prevalence %>% as.numeric %>% setNames(abbrev)
expand_pipes <- function(item) { strsplit(item, "|", fixed = T) %>% unlist }
gene.list <- expand_pipes(ACMG_Lit_Full$Gene)
report <- setNames(ACMG_Lit_Full$Variants_to_report %>% expand_pipes(), gene.list)[!duplicated(gene.list)]
inheritance <- setNames(ACMG_Lit_Full$Inheritance %>% expand_pipes(), gene.list)[!duplicated(gene.list)]
cat(sprintf("Table of Literature-Based Estimates %s x %s (selected rows/columns):", nrow(ACMG_Lit), ncol(ACMG_Lit)))
## Table of Literature-Based Estimates 22 x 17 (selected rows/columns):
ACMG_Lit[c(4,5,8,14),] %>% remove_rownames %>% 
  select(Gene, Phenotype, Abbreviation, Inverse_Prevalence, Allelic_Heterogeneity) %>% pander
Gene Phenotype Abbreviation Inverse_Prevalence
MEN1 Multiple endocrine neoplasia type 1 MEN1 30000
RET Multiple endocrine neoplasia type 2; FMTC MEN2 35000
TSC1|TSC2 Tuberous sclerosis complex TSC 11300
GLA Fabry’s Disease Fabry 1600

Table continues below

Allelic_Heterogeneity
0.9
0.98
0.9
1
acmg_ah <- ACMG_Lit$Allelic_Heterogeneity %>% as.numeric


3.3 Distribution of Prevalences

data.frame(Disease = abbrev, 
         Inverse_Prevalence = inv.prev %>% setNames(NULL)) %>%
ggplot(aes(x = factor(Disease, levels = Disease[order(Inverse_Prevalence)]), y = Inverse_Prevalence)) + 
  geom_point(stat = 'identity') + coord_flip() + xlab("Disease") + 
  scale_y_continuous(trans='log10', breaks = 10^(0:6), 
    labels = sapply(0:6, function(x) paste0("1",paste0(rep("0",x), collapse = "")))) +
  theme(axis.text.y=element_text(size=7)) + 
  ggtitle("Distribution of Inverse Prevalences (log-scale)") + ylab("Inverse Prevalence")

\newpage

3.4 Collect and Aggregate Allele Frequencies at the Disease-Level

We define AF(disease) as the probability of having at least 1 variant associated with the disease.
The frequencies across the relevant variants can be aggregated in two ways:
(1) By direct counting, from genotype data in 1000 Genomes.
(2) AF(disease) = \(1-\prod_{variant}(1-AF_{variant})\), from population data in ExAC (assumes independence).

sample_size <- list()
sample_size$Cohort_1000G <- table(map$super_pop) %>% as.numeric %>% setNames(super.levels) %>% c("1000G"=2504)
sample_size$Cohort_EXAC <- c("AFR"=5203,"AMR"=5789,"EAS"=4327,
                              "EUR"=3307+33370,"SAS"=8256,"EXAC"=60252)
sample_size$Cohort_GNOMAD <- c("AFR"=12942,"AMR"=18237,"EAS"=9472,
                               "EUR"=5081+13046+63416,"SAS"=15450,"GNOMAD"=137644)

aggregateCalc <- function(input, superpop, item, dataset, loc) {
  find = sprintf("AF_%s",toupper(dataset))
  if (superpop!=dataset) 
    find = paste(find, superpop, sep = "_")
  # Aggregation by calculation + ind assumption
  freq <- input[loc,find] %>% unlist %>% as.numeric #vector of all allele frequencies
  if (sum(freq, na.rm = T)==0) { # prob after 0 obs using Laplace's rule of succession.
    freq <- 1/(2+2*sample_size[[paste0("Cohort_",dataset)]][superpop])
  } else {
    if (inheritance[item] == "AR")
      freq <- freq^2       #AR: prob of having a non-reference site at BOTH alleles
    if (inheritance[item] %in% c("AD","SD"))
      freq <- 1-(1-freq)^2 #AD/SD: prob of having a non-reference site at EITHER allele
    if (inheritance[item] == "XL")
      freq <- freq         #XL: prob of having a non-reference site at ONE allele (male)
    freq <- 1-prod(1-freq[!is.na(freq)]) # prob of having a non-reference site in 1 chrom
  }
  return(freq)
}

aggregateCount <- function(input, superpop, item, dataset, loc) {
  # Aggregation by counting
  front_cols <- 1:(grep("HG00096",colnames(input))-1)
  find <- (1:ncol(input))[-front_cols]
  if (superpop != dataset) 
    find <- length(front_cols)+which(map$super_pop==superpop)
  if (inheritance[item] %in% c("AD","SD"))
    reduced_input <- input[loc, find]
  if (inheritance[item] == "AR")
    reduced_input <- input[loc, find]-1 #Looking for 2s
  if (inheritance[item] == "XL") {
    male <- length(front_cols)+which(map$gender=="male")
    reduced_input <- input[loc,intersect(find,male)]
  }
  freq <- mean(apply(reduced_input, 2, function(col) any(col>=1)))
  if (freq==0) 
    freq <- 1/(2+2*sample_size[["Cohort_1000G"]][superpop])
  return(freq)
}

getAlleleFreq <- function(input, ind, dataset) {
  search_list <- ACMG_Lit_Full$Gene
  search_in <- "GENE" #CLNDSDBID
  if (!("CLNSIG" %in% colnames(input)))
  input$CLNSIG <- rep(5, nrow(input))
  KP_only <- grepl(5,input$CLNSIG)
  data.frame(search_list, 
    sapply(search_list, function(item.vec) {
      item.vec.split <- expand_pipes(item.vec)
      sapply(item.vec.split, function(item) {
        loc <- grepl(item,input[,search_in], ignore.case = T)
        if (!grepl("EP",report[[item]])) #If we're only taking KP
          loc <- loc & KP_only #Take all relevant genes
        sapply(c(dataset,super.levels), function(superpop) {
          if (ind) {
            return(aggregateCalc(input, superpop, item, dataset, loc))
          } else {
            return(aggregateCount(input, superpop, item, dataset, loc))
          }
        })
      }) %>% apply(1, function(row) 1-prod(1-row)) %>% 
      setNames(sprintf("AF_%s%s",toupper(dataset), c("",paste0("_",super.levels))))
    }) %>% t %>% tbl_df
  )
}
# Other methods MIM and Tags
# Do NOT use MIM if CLNDSDBID is missing (older VCFs)
#input = ACMG.1000g[select_rows,]; ind = F; method = "Gene"; dataset = "1000G"
freq_1000g.count.gene <- getAlleleFreq(input = merged_1000g, ind = F, dataset = "1000G")
freq_1000g.calc.gene <- getAlleleFreq(input = merged_1000g, ind = T, dataset = "1000G")
freq_gnomad.calc.gene <- getAlleleFreq(input = merged_gnomad, ind = T, dataset = "GNOMAD")
freq_exac.calc.gene <- getAlleleFreq(input = merged_exac, ind = T, dataset = "EXAC")
allele.freq <- data.frame(
                   COUNT_1000G = freq_1000g.count.gene$AF_1000G, 
                   CALC_1000G = freq_1000g.calc.gene$AF_1000G, 
                   CALC_GNOMAD = freq_gnomad.calc.gene$AF_GNOMAD,
                   CALC_EXAC = freq_exac.calc.gene$AF_EXAC
  )
row.names(allele.freq) <- abbrev_all
save(freq_1000g.calc.gene, freq_1000g.count.gene, 
     freq_exac.calc.gene, freq_gnomad.calc.gene, 
     file = "Shiny_App/disease_level_AF.RData")
af_files <- c("freq_1000g.count.gene","freq_1000g.calc.gene","freq_gnomad.calc.gene","freq_exac.calc.gene")
lapply(af_files, function(file) {
  write.csv(eval(parse(text=file)), 
            file = sprintf("Supplementary_Files/Processed_Files/%s.csv",file))
}) %>% invisible
write.csv(allele.freq, file = sprintf("Supplementary_Files/Processed_Files/freq.all.gene.csv",file))


#cor(allele.freq) %>% as.data.frame %>% pander
ggplot(aes(x = CALC_1000G, y = CALC_GNOMAD), data = allele.freq) + 
  geom_point(stat = "identity", col = 'red') + 
  geom_text_repel(aes(label = abbrev_all), size = 3) + 
  scale_x_continuous(limits = c(10^-6, max(allele.freq[,"CALC_GNOMAD"]))*5, 
                     trans='log10', breaks = 10^-(0:6)) + 
  scale_y_continuous(limits = c(10^-6, max(allele.freq[,"CALC_GNOMAD"]))*5, 
                     trans='log10', breaks = 10^-(0:6)) + 
  xlab("Allele Frequency (1000 Genomes)") + ylab("Allele Frequency (gnomAD)") +
  geom_abline(slope = 1, intercept = 0) + ggtitle("Scatterplot: gnomAD v. 1000 Genomes")

\newpage

Ratio_1000G (red, top) computes AF(calculation in 1000 Genomes) / AF(counting in 1000 Genomes).
Ratio_gnomAD (blue, bottom) computes AF(calculation in gnomAD) / AF(calculation in 1000 Genomes).

#Ranked by max of the of the 2 ratios in each disease
ratio_diff <- function() {
  ratio <- data.frame(Means = allele.freq$COUNT_1000G, 
                      Ratio_1000G = (pmax(allele.freq$CALC_1000G, allele.freq$COUNT_1000G)/
                                      pmin(allele.freq$CALC_1000G, allele.freq$COUNT_1000G)), 
                      Ratio_gnomAD = (pmax(allele.freq$CALC_GNOMAD, allele.freq$CALC_1000G)/
                                     pmin(allele.freq$CALC_GNOMAD, allele.freq$CALC_1000G))) %>%
    mutate(Disease = factor(abbrev_all, 
           levels = abbrev_all[order(pmax(Ratio_gnomAD,Ratio_1000G))])) %>% 
    gather(Dataset, Ratio, Ratio_1000G, Ratio_gnomAD) %>%
    filter(is.finite(Ratio))
  ggplot(aes(x=Disease, y=Ratio, color = Dataset), data = ratio) + coord_flip() + 
    geom_point(stat = 'identity') + facet_wrap("Dataset", ncol = 1) + 
    ggtitle("Ratios of Allele Frequencies from Different Methods") + 
    scale_y_continuous(breaks = seq(0,100,1)) + theme(legend.position="none")
}
ratio_diff()


\newpage

Sampling 1000 variants from all variants in 1000 Genomes to test deviations from independence assumptions.
Repeat for 1000 trials and plot the distribution of disease-level allele frequencies (1000 points per disease).
Only variants with allele frequency > 0.01 are evaluated. Since we look at 17 variants per disease, the maximum is approximately \(1-(1-0.01)^{34} \approx 0.29\)

plot_ind_test <- function() {
  ind_test_data <- readRDS("Supplementary_Files/ind_test.rds")
  #set.seed(123)
  #do.call("rbind",lapply(1:1000, function(x) {
  #  print(x)
  #  lapply(ACMG.panel, function(gene){
  #    sites <- which(ACMG.1000g$GENE==gene)
  #    small_sites <- sites[ACMG.1000g[sites,"AF_1000G"] < 0.01]
  #    sample(small_sites,17)
  #  }) %>% unlist -> select_rows
  #  data.frame(DISEASE = ACMG_Lit_Full$Abbreviation,
  #             COUNT = getAlleleFreq(ACMG.1000g[select_rows,], 
  #               ind = F, dataset = "1000G")$AF_1000G, 
  #             CALC = getAlleleFreq(ACMG.1000g[select_rows,], 
  #               ind = T, dataset = "1000G")$AF_1000G)
  #})) -> ind_test_data
  #saveRDS(ind_test_data, "Supplementary_Files/ind_test.rds")
  #plot(ind_test_data %>% ggplot(aes(x=CALC-COUNT)) + geom_histogram(bins = 100) + 
  #  xlab("Calculation - Counting") + ylab("Histogram Counts"))
  #plot(ind_test_data %>% filter(COUNT>0) %>% ggplot(aes(x=pmax.int(CALC/COUNT, COUNT/CALC))) + 
  #  geom_histogram(bins = 100) + xlab("Calculation/Counting") + ylab("Histogram Counts") + xlim(0,4))
  #+ facet_wrap("DISEASE", ncol = 3))
  sapply(levels(ind_test_data$DISEASE), function(d) {
    ind_test_data %>% filter(DISEASE == d) %>% 
      mutate(DIFF = CALC-COUNT) %>% mutate(RATIO = pmax(CALC/COUNT,COUNT/CALC)) %>%
      select(RATIO) %>% unlist
  }) %>% data.frame %>% gather(Disease, Data) %>% filter(is.finite(Data)) -> plot_data
  
  plot(plot_data %>% filter(!(Disease %in% c("Wilson","HCRC.AR","OTC"))) %>% 
  ggplot(aes(x = Disease, y = Data)) + geom_boxplot() + coord_flip() + 
  ylab("Allele Frequency Difference: Calculation/Counting") + 
  ggtitle("Differences in AF Methods: by Disease"))
  
  plot(plot_data %>% filter(Disease %in% c("Wilson","HCRC.AR","OTC")) %>% 
  ggplot(aes(x = Data)) + geom_histogram(bins = 100) + 
  facet_wrap("Disease", ncol = 1, scales = "free") + 
  xlab("Allele Frequency Difference: Calculation/Counting") + 
  ggtitle("Differences in AF Methods: by Disease (Outliers)"))
  

  plot(ggplot(aes(x = COUNT, y = CALC), data = ind_test_data[runif(nrow(ind_test_data)) < 0.1,]) +
         geom_point() + ggtitle("Testing Independence with Random Sampling") + 
         scale_x_continuous(limits = c(0,1), breaks = seq(0,1,0.1)) + 
         scale_y_continuous(limits = c(0,1), breaks = seq(0,1,0.1)) + 
         xlab("Allele Frequency (From Counting)") + ylab("Allele Frequency (From Calculation)") +
         geom_abline(slope = 1, intercept = 0, color = 'blue') +
         geom_text(aes(label = replace(DISEASE,which(abs(COUNT - CALC) < 0.3), NA)), 
                   check_overlap = F, hjust = 1, na.rm = T, size = 2.5)
       )
  cat("30 diseases x 1000 points = 30,000 points.")
  cat("This plot has been downsampled 10x and contains 3,000 points.")
  ind_test_data %>% filter(COUNT>0)
}
plot_ind_out <- plot_ind_test()

## 30 diseases x 1000 points = 30,000 points.This plot has been downsampled 10x and contains 3,000 points.
sprintf("Pearson correlation: %s",
        signif(cor(plot_ind_out$COUNT,plot_ind_out$CALC),3)) %>% cat
## Pearson correlation: 0.995
sprintf("Mean ratio (Calculation/Counting): %s", 
        signif(mean(plot_ind_out$CALC/plot_ind_out$COUNT, na.rm = T),3)) %>% cat
## Mean ratio (Calculation/Counting): 0.971
rm(plot_ind_out)
\newpage

3.5 Penetrance as a Function of P(V|D)

The left end of the boxplot indicates P(V|D) = 0.01,
the bold line in the middle indicates P(V|D) = point value,
the right end of the boxplot indicates P(V|D) = 1.

if (nrow(allele.freq)==nrow(ACMG_Lit_Full))
  allele.freq <- allele.freq[ACMG_Lit_Full$Evaluate,]
get_penetrance <- function(ah_low, ah_high, dataset) {
  # Map of disease name to disease tags
  if (dataset == "1000 Genomes")
    named.freqs <- allele.freq$COUNT_1000G %>% setNames(abbrev)
  if (dataset == "gnomAD")
    named.freqs <- allele.freq$CALC_GNOMAD %>% setNames(abbrev)
  if (dataset == "ExAC")
    named.freqs <- allele.freq$CALC_EXAC %>% setNames(abbrev)
  named.prev <- 1/inv.prev %>% setNames(abbrev)
  # Repeats allow for correct quartile calculations
  #point estimate set to arithmetic mean
  allelic.het <- c(ah_low, ah_low, mean(c(ah_low, ah_high)) %>% signif(3), ah_high, ah_high) %>% 
    rep(nrow(ACMG_Lit)) %>% matrix(nrow = nrow(ACMG_Lit), byrow = T)
  allelic.het[,3] <- acmg_ah
  #Functions to transform data points with disease_af = 0
  set_to_na <- function(row) { replace(row, is.infinite(row),NA) %>% pmin(1)}
  # Matrix of penetrance values for allelic het range, capped at 1
  prev_freq <- named.prev/named.freqs %>% rep(5) %>% matrix(nrow = nrow(ACMG_Lit))
  penetrance <- apply(allelic.het * prev_freq, 1, set_to_na) %>% as.data.frame
  # Take row 5 to sort by max, take colSums to sort by overall
  # Break ties by rows 3 and 1 (mean and low)
  ord <- order(penetrance[5,], penetrance[3,], penetrance[1,], decreasing = T)
  # replicate each element n times to create labels
  penetrance_data <- data.frame("Penetrance" = penetrance %>% unlist, "Disease" =
    factor(sapply(abbrev, function(x) rep(x,5)) %>% as.vector,
    levels = abbrev[ord]) )
  colormap <- rep('black', length(abbrev))
  num_na <- mean(is.na(penetrance_data$Penetrance))*length(colormap)
  if (num_na != 0)
    colormap[1:num_na] <- 'gray60'
  colormap <- rev(colormap)
  plot(
    ggplot(aes(x=Disease, y=Penetrance), data = penetrance_data) + 
    geom_boxplot(position = 'identity', coef = 0, na.rm = T) + coord_flip() + 
      ggtitle(sprintf("%s: Barplot of Min/Point/Max Penetrance", dataset)) + 
      theme(axis.text.y = element_text(color = colormap)) + ylim(0,1)
      #annotate("text", x = which(colormap == 'red'), y = 0, 
      #         label = "No Allele Frequency Data", hjust = 0, size = 3)
  )
  penetrance_data
}
pen_gnomad <- get_penetrance(ah_low = 0.01, ah_high = 1, dataset = "gnomAD")

pen_1000g <-  get_penetrance(ah_low = 0.01, ah_high = 1, dataset = "1000 Genomes")

#ah_low = 0.01; ah_high = 1; dataset = "gnomAD"


Note: Some diseases have mean theoretical penetrance = 1 because the assumed allelic heterogeneity is greater than is possible, given the observed prevalence and allele frequencies.

\newpage

3.6 Penetrance Estimates by Ancestry

ancestry_penetrance <- function(ah_low, ah_high, dataset, range, position) {
  pos <- replace(c(F,F,F,F,F), ifelse(position == "Max", 5, 3), T)
  sapply(c("Total",super.levels), function(superpop){
    # Map of disease name to disease tags
    find <- paste0("AF_", toupper(dataset))
    named.freqs <- eval(parse(text=sprintf("freq_%s.calc.gene", tolower(dataset))))[ACMG_Lit_Full$Evaluate,]
    if (superpop != "Total") 
      find <- paste(find, superpop, sep = "_")
    named.freqs <- named.freqs[,find] %>% unlist %>% setNames(abbrev)
    allelic.het <- c(ah_low, ah_low, mean(c(ah_low, ah_high)) %>% signif(3), ah_high, ah_high) %>% 
      rep(nrow(ACMG_Lit)) %>% matrix(nrow = nrow(ACMG_Lit), byrow = T)
    allelic.het[,3] <- acmg_ah
    #make_prev_ranges(range) -> prev_final
    # Matrix of penetrance values for allelic het range, capped at 1
    set_to_na <- function(row) { replace(row, is.infinite(row),NA) %>% pmin(1)}
    apply(allelic.het / ACMG_Lit$Inverse_Prevalence / named.freqs, 1, set_to_na) %>% unlist
  }) %>% as.data.frame -> penetrance_init
  ### Quantifying ancestral variation
  #temp <- penetrance_init[pos,]
  #sort(apply(temp, 1, max, na.rm = T) - apply(temp, 1, min, na.rm = T))
  #sort(apply(temp, 1, max, na.rm = T) / apply(temp, 1, min, na.rm = T))
  # Take column 5 to sort by max, take rowSums to sort by overall
  # Break ties by rows 3 and 1 (mean and low)
  mat_pen <- matrix(penetrance_init$Total, ncol = 5, byrow = T)
  ord <- order(mat_pen[,5], mat_pen[,3], mat_pen[,1], decreasing = T)
  penetrance_data <- data.frame(penetrance_init, 
                                "Disease" = factor(sapply(abbrev, 
                                function(x) rep(x,5)) %>% as.vector,
                                levels = abbrev[ord]) ) 
  #m <- list(l = 170, r = 220, b = -50, t = 50, pad = 5)
  #  heatmap <- plot_ly(
  #    x = factor(c(super.levels,"Total"), levels = c("Total",super.levels)),
  #    y = factor(sapply(abbrev, function(x) rep(x,5)) %>% as.vector, levels = abbrev[ord]),
  #    z = penetrance_init[pos,][ord,] %>% as.matrix, type = "heatmap", height = 700
  #  ) %>% layout(autosize = T, margin = m); heatmap
  # Star/Radar Plot
  temp <- penetrance_data[pos,] %>% select(-Disease, -Total)
  rownames(temp) <- abbrev
  order(rowSums(temp, na.rm = T), decreasing = T)[1:10] -> wanted
  col <- 3
  stars(temp[wanted,], scale = F, full = F, len = 1, nrow = ceiling(10/col), ncol = col, 
        flip.labels = F, key.loc = c(2*col,2), 
        main = sprintf("Radar Plot: %s Penetrance by Ancestry (%s)", position, dataset),
        draw.segments = T, col.segments = c('red','yellow','green','blue','purple'))
  print("These are the top 10 diseases by summed allele frequencies. NULL values are not plotted.", quote = F)
  print("Each radius is proportional to the penetrance of the disease in the given population.", quote = F)
  # Barplot
  penetrance_data <- gather(penetrance_data, Subset, Penetrance, -Disease)
  plot(ggplot(aes(x=Disease, y=Penetrance), data = penetrance_data) + 
    geom_boxplot(position = 'identity', coef = 0, na.rm = T) + coord_flip() + 
    facet_wrap(~Subset, ncol = 2) + ggtitle(sprintf("Barplot: Penetrance by Ancestry (%s)", dataset)) + 
    theme(axis.text.y=element_text(size=6), axis.text.x = element_text(angle = -20, hjust = 0.4))
  )
  # Heatmap
  plot(ggplot(aes(x=Disease, y = Subset), data = penetrance_data[pos,]) + coord_flip() + 
    geom_tile(aes(fill = Penetrance), color = 'white') + xlab("Disease") + ylab("Ancestry") +
    scale_fill_gradient(low='white',high = 'darkblue', na.value = "grey50",
      breaks=c(0,0.25,0.5,0.75,1), labels=c("0","0.25","0.50","0.75","1.00"), limits =c(0,1)) + 
    ggtitle(sprintf("Heatmap: %s Penetrance by Ancestry (%s)", position, dataset)) + 
    theme_minimal() + theme(axis.ticks = element_blank()) + 
    annotate("segment", y=c(0.5,5.5,6.5), yend=c(0.5,5.5,6.5), 
             x=0.5, xend = length(abbrev)+0.5) +
    annotate("segment", y=0.5, yend=6.5, 
             x=c(0.5,length(abbrev)+0.5), 
             xend = c(0.5,length(abbrev)+0.5))
  )
  cat("Dark gray boxes are NA: no associated variants discovered in that ancestral population.")
}
ancestry_penetrance(ah_low = 0.01, ah_high = 1, dataset = "gnomAD", range = 5, position = "Max")

## [1] These are the top 10 diseases by summed allele frequencies. NULL values are not plotted.
## [1] Each radius is proportional to the penetrance of the disease in the given population.

## Dark gray boxes are NA: no associated variants discovered in that ancestral population.
#ancestry_penetrance(ah_low = 0.01, ah_high = 1, dataset = "1000G", range = 5, position = "Max")
#ah_low = 0.01; ah_high = 1; dataset = "gnomAD"; range = 5; position = "Max"
## Empirical CDFs for All Penetrance Plots
pen_1000g$Penetrance[c(F,F,F,F,T)] %>% ecdf %>% 
  plot(ylab = "Fraction with max theoretical penetrance < P", xlab = "P", 
       main = "CDF: Penetrance ~ P(V|D)", ylim = c(0,1.02), pch = 19)
pen_gnomad$Penetrance[c(F,F,F,F,T)] %>% ecdf %>% plot(col = 'red', add = T, pch = 1)
legend("bottomright", legend = c("gnomAD","1000G"), col = c("red","black"), pch = c(1,19))
download.file(url = "http://www.omim.org/static/omim/data/mim2gene.txt", destfile = "Supplementary_Files/mim2gene.txt", method = "internal")
mim2gene <- read.table(file = "Supplementary_Files/mim2gene.txt", header = FALSE, sep = "\t", stringsAsFactors = F)
colnames(mim2gene) <- c("MIM_Number","MIM_Entry_Type","Entrez_Gene_ID (NCBI)","Approved_Gene_Symbol_(HGNC)","Ensembl_Gene_ID_(Ensembl)")
mim2gene <- mim2gene[mim2gene$`Approved_Gene_Symbol_(HGNC)`!="",]
ACMG.ENSG <- mim2gene[mim2gene$`Approved_Gene_Symbol_(HGNC)` %in% ACMG.table$Gene_Name,
                      "Ensembl_Gene_ID_(Ensembl)"] %>% strsplit(",") %>% sapply("[",1)
mybpc3.page <- scrape(url="http://gnomad.broadinstitute.org/gene/ENSG00000134571")[[1]]
mybpc3.page <- scrape(url="http://exac.broadinstitute.org/gene/ENSG00000134571")[[1]]
    mybpc3.table <- readHTMLTable(mybpc3.page, stringsAsFactors = F, header = T)
    colnames(mybpc3.table) <- NULL
## Test Statistics for Ancestral Differences

#F-statistic/T-statistic: probability that the different groups are sampled from distributions with the same mean. <br />
#These plots are from 4(a) - 1000 Genomes Fraction with 1+ Non-Reference Site, but can be replicated for plots 2(ab) and 3(ab) as well. 

#Calculating test statistics (F-values)
Fcalc <- function(values, pop) {
  if (missing(pop)) {
    groups <- super[pop.levels]
  } else {
    groups <- ifelse(super[pop.levels]==pop,pop,"Other")
  }
  data <- data.frame(y = values, group = factor(groups))
  color_map <- c("red","gold3","springgreen3","deepskyblue","violet","white") %>% 
    setNames(c("AFR","AMR","EAS","EUR","SAS","Other"))
  out <- lm(y ~ group, data) %>% anova
  plot(y ~ group, data, xlab = NULL, ylab = NULL, 
       col = color_map[sort(unique(groups))], 
       main = paste("F-statistic:",out$`Pr(>F)`[1] %>% signif(3)))
  out
}
par(mfrow=c(2,3))
F_values <- c(Fcalc(values$Mean)$`Pr(>F)`[1] %>% setNames("Overall"), 
              sapply(super.levels, function(pop) {
                Fcalc(values$Mean, pop)$`Pr(>F)`[1]
              }))
par(mfrow=c(1,1), mar=c(5, 4, 4, 2)+0.1)
#F_values